详细内容见“各个模型”一节。

将数据集分割为训练集和测试集

import numpy as np

# Seed NumPy's global random generator so this notebook's output is identical
# at every run.  NumPy must be imported before the seed call, otherwise `np`
# is not yet defined and the call raises NameError.
np.random.seed(42)

# For illustration only. Sklearn has train_test_split()
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a training set and a test set.

    The row positions are shuffled with ``np.random.permutation`` and the
    first ``int(len(data) * test_ratio)`` shuffled positions form the test
    set; the remaining positions form the training set.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing
            (e.g. a pandas DataFrame).
        test_ratio: Fraction of the rows to place in the test set.

    Returns:
        A ``(train, test)`` tuple of the selected rows.
    """
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]


# The random split above works, but it is not perfect: running the program
# again produces a different test set.  Over many runs you (or your machine
# learning algorithm) would end up seeing the whole dataset, which must be
# avoided.  The identifier-hash check that follows relies on crc32, so the
# import has to live at module level (it was previously indented into the
# function body after `return`, where it could never execute).
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into train and test sets based on an identifier column.

    Each value of ``data[id_column]`` is passed to ``test_set_check``; rows
    for which it returns a truthy value make up the test set and all other
    rows make up the training set.

    Args:
        data: Table supporting ``data[id_column]`` and boolean ``.loc``
            selection (e.g. a pandas DataFrame).
        test_ratio: Ratio forwarded to ``test_set_check``.
        id_column: Name of the column holding the row identifiers.

    Returns:
        A ``(train, test)`` tuple of the selected rows.
    """
    def _is_test_row(row_id):
        return test_set_check(row_id, test_ratio)

    test_mask = data[id_column].apply(_is_test_row)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part



# First way to obtain an identifier: reset_index() adds an `index` column
# that is then used as the row id.
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(
    data=housing_with_id,
    test_ratio=0.2,
    id_column="index",
)

# Second way to add an identifier: combine longitude and latitude into an
# "id" column and split on that column instead.
housing_with_id["id"] = 1000 * housing["longitude"] + housing["latitude"]
train_set, test_set = split_train_test_by_id(
    data=housing_with_id,
    test_ratio=0.2,
    id_column="id",
)


# If you only need a simple split for a prediction-model example,
# split_train_test is enough.  Scikit-Learn also provides functions that
# split a dataset into subsets in several ways.  The simplest one is
# train_test_split, which works much like split_train_test with a few extra
# features.  First, it has a random_state parameter for setting the random
# generator seed discussed earlier.  Second, you can pass it several datasets
# with the same number of rows and it splits them on the same indices (very
# useful when, for example, the labels live in a separate DataFrame).
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)

另外一种拆分形式：分层采样

将人群分成均匀的子分组，称为分层，从每个分层抽取合适数量的实例，以保证测试集对总体有代表性。例如，美国人口的 51.3% 是女性，48.7% 是男性。所以在美国，严谨的调查需要保证样本也是这个比例：例如在 1000 人的样本中，应有 513 名女性和 487 名男性。

数据集中的每个分层都要有足够的实例位于你的数据中，这点很重要。否则，对分层重要性的评估就会有偏差。这意味着，你不能有过多的分层，且每个分层都要足够大。后面的代码通过将收入中位数除以 1.5（以限制收入分类的数量），创建了一个收入类别属性，用ceil对值舍入（以产生离散的分类），然后将所有大于 5的分类归入到分类5 ：

# Preprocessing: create the "income_cat" attribute.
# Dividing the median income by 1.5 limits the number of income categories,
# and np.ceil rounds the values up to produce discrete categories.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Keep the categories below 5 and set every other value to 5.0.
# Note on `inplace`: methods that modify data and return a new object usually
# accept an optional `inplace` argument; inplace=True changes the original
# object's values directly, while inplace=False leaves it untouched and
# returns the result as a new object.  The result is assigned back explicitly
# here instead of calling housing["income_cat"].where(..., inplace=True),
# because that form mutates an intermediate Series and does not update
# `housing` when pandas Copy-on-Write is in effect.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)
# Stratified sampling can now be done on the income category, using
# Scikit-Learn's StratifiedShuffleSplit class.
from sklearn.model_selection import StratifiedShuffleSplit

# random_state seeds the random generator so the same split is produced on
# every run.  n_splits is the number of train/test pairs to generate; a
# single pair is requested here.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields positional row indices, so the rows are selected with .iloc.
# Label-based .loc only picks the same rows when `housing` still carries the
# default 0..n-1 index.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

# Remove the income_cat attribute from both sets so the data is back to its
# original state.  The loop variable is not called `set`: at module level
# that would rebind the built-in `set` for all code that runs afterwards.
for subset in (strat_train_set, strat_test_set):
    subset.drop(["income_cat"], axis=1, inplace=True)

Question

# 注
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)#参数 random_state控制是将样本随机打乱
# 之前用此函数取的housing，即housing经过分层采样，去掉median_value,经过转换流水线
# 也有np.random.seed(42)，然后再调用np.random.permutation(len(data))用以分割啥的

打乱顺序

import numpy as np

# Draw one random permutation of 60000 positions and index both arrays with
# it, so every sample stays paired with its label after shuffling.
shuffle_index = np.random.permutation(60000)
x_train, y_train = x_train[shuffle_index], y_train[shuffle_index]

对X进行特征缩放

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features and transform them in one step;
# the data is cast to float64 before being passed to the scaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))